import pandas as pd
import numpy as np
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from plotly.offline import iplot
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from sklearn.metrics import silhouette_score
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score
import ipywidgets as widgets
from keras.models import load_model
from ipywidgets import *
print("Libraries loaded!")
Libraries loaded!
# !jupyter nbextension enable – py widgetsnbextension – sys-prefix
# !jupyter serverextension enable voila – sys-prefix
print()
print("Loading data.....")
# Load the raw transactions. Latin-1 encoding is required for the description text.
# FIX: the dtype mapping previously targeted 'InvoiceID', a column that does not
# exist — the invoice column is named 'InvoiceNo' (pandas silently ignores dtype
# keys for missing columns, so invoice numbers were never actually forced to str).
customer_data = pd.read_csv('E-Commerce_Data.csv', encoding='ISO-8859-1', dtype={'InvoiceNo': str})
customer_data['InvoiceDate'] = pd.to_datetime(customer_data['InvoiceDate']) #convert to python datetime object
print("Data loaded!")
Loading data..... Data loaded!
print()
def check_data(dataframe):
    """Print a quick structural overview of *dataframe*.

    Shows, in order: shape, column index, dtypes, first and last five rows,
    and the transposed numeric summary. Purely a printing helper; returns None.
    """
    overview = [
        (" *********************************SHAPE******************************", dataframe.shape),
        ("*********************************COLUMNS******************************", dataframe.columns),
        ("**********************************TYPES*******************************", dataframe.dtypes),
        ("**********************************HEAD*******************************", dataframe.head()),
        ("**********************************TAIL*******************************", dataframe.tail()),
    ]
    for banner, content in overview:
        print(banner)
        print(content)
        print()
        print()
    print("*******************************DESCRIPTION***************************")
    print(dataframe.describe().T)
    print()
# Print the structural overview of the raw e-commerce data (541909 rows x 8 cols).
check_data(customer_data)
*********************************SHAPE******************************
(541909, 8)
*********************************COLUMNS******************************
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
'UnitPrice', 'CustomerID', 'Country'],
dtype='object')
**********************************TYPES*******************************
InvoiceNo object
StockCode object
Description object
Quantity int64
InvoiceDate datetime64[ns]
UnitPrice float64
CustomerID float64
Country object
dtype: object
**********************************HEAD*******************************
InvoiceNo StockCode Description Quantity \
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6
1 536365 71053 WHITE METAL LANTERN 6
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6
InvoiceDate UnitPrice CustomerID Country
0 2010-12-01 08:26:00 2.55 17850.0 United Kingdom
1 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
2 2010-12-01 08:26:00 2.75 17850.0 United Kingdom
3 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
4 2010-12-01 08:26:00 3.39 17850.0 United Kingdom
**********************************TAIL*******************************
InvoiceNo StockCode Description Quantity \
541904 581587 22613 PACK OF 20 SPACEBOY NAPKINS 12
541905 581587 22899 CHILDREN'S APRON DOLLY GIRL 6
541906 581587 23254 CHILDRENS CUTLERY DOLLY GIRL 4
541907 581587 23255 CHILDRENS CUTLERY CIRCUS PARADE 4
541908 581587 22138 BAKING SET 9 PIECE RETROSPOT 3
InvoiceDate UnitPrice CustomerID Country
541904 2011-12-09 12:50:00 0.85 12680.0 France
541905 2011-12-09 12:50:00 2.10 12680.0 France
541906 2011-12-09 12:50:00 4.15 12680.0 France
541907 2011-12-09 12:50:00 4.15 12680.0 France
541908 2011-12-09 12:50:00 4.95 12680.0 France
*******************************DESCRIPTION***************************
count mean std min 25% 50% \
Quantity 541909.0 9.552250 218.081158 -80995.00 1.00 3.00
UnitPrice 541909.0 4.611114 96.759853 -11062.06 1.25 2.08
CustomerID 406829.0 15287.690570 1713.600303 12346.00 13953.00 15152.00
75% max
Quantity 10.00 80995.0
UnitPrice 4.13 38970.0
CustomerID 16791.00 18287.0
#Get the total number of missing values for each attribute
print()
null_counts = customer_data.isnull().sum()
print(null_counts)
print()
InvoiceNo 0 StockCode 0 Description 1454 Quantity 0 InvoiceDate 0 UnitPrice 0 CustomerID 135080 Country 0 dtype: int64
#Check for repeated rows
print()
duplicate_total = customer_data.duplicated().sum()
print("Duplicated values:", duplicate_total)
print()
Duplicated values: 5268
#Get the unique number of countries within the data set
print()
# One row per (customer, invoice, country) triple; counting 'Country' values
# then gives the number of orders placed from each country.
order_groups = customer_data.groupby(['CustomerID', 'InvoiceNo', 'Country']).count()
order_groups = order_groups.reset_index(drop=False)
countries = order_groups['Country'].value_counts()
print('Number of countries in the dataframe: {}'.format(len(countries)))
Number of countries in the dataframe: 37
#Visualize the total number of orders for different countries
data = dict(
    type='choropleth',
    locations=countries.index,
    locationmode='country names',
    z=countries,
    text=countries.index,
    colorbar={'title': 'Order nb.'},
    # Highly skewed order counts, so the scale is concentrated near zero.
    colorscale=[
        [0, 'rgb(224,255,255)'],
        [0.01, 'rgb(166,206,227)'],
        [0.02, 'rgb(31,120,180)'],
        [0.03, 'rgb(178,223,138)'],
        [0.05, 'rgb(51,160,44)'],
        [0.10, 'rgb(251,154,153)'],
        [0.20, 'rgb(255,255,0)'],
        [1, 'rgb(227,26,28)'],
    ],
    reversescale=False,
)
layout = dict(
    title='Number of orders per country',
    geo=dict(showframe=True, projection={'type': 'mercator'}),
)
choromap = go.Figure(data=[data], layout=layout)
iplot(choromap, validate=False)
print()
#Total number of customers, products and transactions
# nunique() equals len(value_counts()): both ignore NaN entries.
pd.DataFrame([{'products': customer_data['StockCode'].nunique(),
               'transactions': customer_data['InvoiceNo'].nunique(),
               'customers': customer_data['CustomerID'].nunique(),
               }], columns=['products', 'transactions', 'customers'], index=['quantity'])
| products | transactions | customers | |
|---|---|---|---|
| quantity | 4070 | 25900 | 4372 |
#Number of products purchased in every transaction
print()
# Counting any always-present column (InvoiceDate) per (customer, invoice)
# yields the number of line items in each basket.
basket_counts = customer_data.groupby(by=['CustomerID', 'InvoiceNo'], as_index=False)['InvoiceDate'].count()
no_products_per_basket = basket_counts.rename(columns={'InvoiceDate': 'Number of products'})
no_products_per_basket[:10].sort_values('CustomerID')
| CustomerID | InvoiceNo | Number of products | |
|---|---|---|---|
| 0 | 12346.0 | 541431 | 1 |
| 1 | 12346.0 | C541433 | 1 |
| 2 | 12347.0 | 537626 | 31 |
| 3 | 12347.0 | 542237 | 29 |
| 4 | 12347.0 | 549222 | 24 |
| 5 | 12347.0 | 556201 | 18 |
| 6 | 12347.0 | 562032 | 22 |
| 7 | 12347.0 | 573511 | 47 |
| 8 | 12347.0 | 581180 | 11 |
| 9 | 12348.0 | 539318 | 17 |
# Stock codes that start with letters appear to mark special transactions
# (postage, fees, discounts, ...) rather than products — see the list below.
alpha_code_mask = customer_data['StockCode'].str.contains('^[a-zA-Z]+', regex=True)
particular_code_list = customer_data[alpha_code_mask]['StockCode'].unique()
particular_code_list
array(['POST', 'D', 'C2', 'DOT', 'M', 'BANK CHARGES', 'S', 'AMAZONFEE',
'DCGS0076', 'DCGS0003', 'gift_0001_40', 'DCGS0070', 'm',
'gift_0001_50', 'gift_0001_30', 'gift_0001_20', 'DCGS0055',
'DCGS0072', 'DCGS0074', 'DCGS0069', 'DCGS0057', 'DCGSSBOY',
'DCGSSGIRL', 'gift_0001_10', 'PADS', 'DCGS0004', 'DCGS0073',
'DCGS0071', 'DCGS0068', 'DCGS0067', 'DCGS0066P', 'B', 'CRUK'],
dtype=object)
# Inspect the rows carrying the special stock code 'B' ("Adjust bad debt" rows).
particular_code_description = customer_data.loc[customer_data['StockCode'] == 'B']
particular_code_description
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 299982 | A563185 | B | Adjust bad debt | 1 | 2011-08-12 14:50:00 | 11062.06 | NaN | United Kingdom |
| 299983 | A563186 | B | Adjust bad debt | 1 | 2011-08-12 14:51:00 | -11062.06 | NaN | United Kingdom |
| 299984 | A563187 | B | Adjust bad debt | 1 | 2011-08-12 14:52:00 | -11062.06 | NaN | United Kingdom |
#Fill the missing values
# FIX: fillna(inplace=True) on a column selection relies on chained assignment,
# which is deprecated in recent pandas (and stops working under copy-on-write);
# assign the filled column back instead.
# NOTE(review): imputing CustomerID with the column *mean* merges ~135k rows
# into one artificial "customer" (15287.69057 shows up in later tables) —
# consider dropping those rows instead; TODO confirm intent.
customer_data['CustomerID'] = customer_data['CustomerID'].fillna(customer_data['CustomerID'].mean())
customer_data['Description'] = customer_data['Description'].fillna('')
print("Done!")
Done!
#Check if there are still missing values
# (expects all-zero counts after the imputation above)
customer_data.isnull().sum()
InvoiceNo 0 StockCode 0 Description 0 Quantity 0 InvoiceDate 0 UnitPrice 0 CustomerID 0 Country 0 dtype: int64
#remove duplicated rows
# Drops the 5268 exact-duplicate rows found earlier; modifies the frame in place.
customer_data.drop_duplicates(inplace=True)
print("Done!")
Done!
#Check if there are still duplicates (expects 0)
print("Repeated entries:",customer_data.duplicated().sum())
Repeated entries: 0
# Row count after de-duplication (541909 - 5268 = 536641).
print("Length of data set now", len(customer_data))
Length of data set now 536641
#Gather all orders that might indicate a cancelled order
# Cancelled invoices carry negative quantities (and a 'C'-prefixed InvoiceNo,
# per the sample rows below).
cancelledOrders=customer_data[customer_data["Quantity"]<0]
cancelledOrders.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 141 | C536379 | D | Discount | -1 | 2010-12-01 09:41:00 | 27.50 | 14527.0 | United Kingdom |
| 154 | C536383 | 35004C | SET OF 3 COLOURED FLYING DUCKS | -1 | 2010-12-01 09:49:00 | 4.65 | 15311.0 | United Kingdom |
| 235 | C536391 | 22556 | PLASTERS IN TIN CIRCUS PARADE | -12 | 2010-12-01 10:24:00 | 1.65 | 17548.0 | United Kingdom |
| 236 | C536391 | 21984 | PACK OF 12 PINK PAISLEY TISSUES | -24 | 2010-12-01 10:24:00 | 0.29 | 17548.0 | United Kingdom |
| 237 | C536391 | 21983 | PACK OF 12 BLUE PAISLEY TISSUES | -24 | 2010-12-01 10:24:00 | 0.29 | 17548.0 | United Kingdom |
#Check if negative quantity correspond to cancelled transaction
# Example in the output: customer 12346 bought 74215 units (541431) and then
# cancelled the same amount (C541433).
display(customer_data.sort_values('CustomerID')[:5])
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 61619 | 541431 | 23166 | MEDIUM CERAMIC TOP STORAGE JAR | 74215 | 2011-01-18 10:01:00 | 1.04 | 12346.0 | United Kingdom |
| 61624 | C541433 | 23166 | MEDIUM CERAMIC TOP STORAGE JAR | -74215 | 2011-01-18 10:17:00 | 1.04 | 12346.0 | United Kingdom |
| 428981 | 573511 | 22992 | REVOLVER WOODEN RULER | 12 | 2011-10-31 12:25:00 | 1.95 | 12347.0 | Iceland |
| 429001 | 573511 | 20719 | WOODLAND CHARLOTTE BAG | 10 | 2011-10-31 12:25:00 | 0.85 | 12347.0 | Iceland |
| 429002 | 573511 | 23162 | REGENCY TEA STRAINER | 8 | 2011-10-31 12:25:00 | 3.75 | 12347.0 | Iceland |
#Inspect if all orders with negative quantity indicate cancelled orders
# For each negative-quantity row, look for an earlier purchase by the same
# customer of the same item with the opposite quantity.
data_check = customer_data[customer_data['Quantity'] < 0][
    ['CustomerID', 'Quantity', 'StockCode', 'Description', 'UnitPrice']]
for index, col in data_check.iterrows():
    # BUG FIX: the original compared the 'Description' column against col[2],
    # which is the *StockCode* value (columns are CustomerID, Quantity,
    # StockCode, Description, UnitPrice). Use label-based access so the right
    # fields are compared.
    counterpart = customer_data[
        (customer_data['CustomerID'] == col['CustomerID'])
        & (customer_data['Quantity'] == -col['Quantity'])
        & (customer_data['Description'] == col['Description'])
    ]
    if counterpart.shape[0] == 0:
        print(data_check.loc[index])
        print('Hypothesis NOT fulfilled')
        break
CustomerID 14527.0 Quantity -1 StockCode D Description Discount UnitPrice 27.5 Name: 141, dtype: object Hypothesis NOT fulfilled
#Perform the previous check but ignore discount.
data_check = customer_data[(customer_data['Quantity'] < 0)
                           & (customer_data['Description'] != 'Discount')][
    ['CustomerID', 'Quantity', 'StockCode', 'Description', 'UnitPrice']]
for index, col in data_check.iterrows():
    # BUG FIX: same label/position mismatch as the previous cell — col[2] is
    # the StockCode value, not the Description; use named access.
    counterpart = customer_data[
        (customer_data['CustomerID'] == col['CustomerID'])
        & (customer_data['Quantity'] == -col['Quantity'])
        & (customer_data['Description'] == col['Description'])
    ]
    if counterpart.shape[0] == 0:
        print(data_check.loc[index])
        print('HYPOTHESIS not fulfilled')
        break
CustomerID 15311.0 Quantity -1 StockCode 35004C Description SET OF 3 COLOURED FLYING DUCKS UnitPrice 4.65 Name: 154, dtype: object HYPOTHESIS not fulfilled
#Gather all entries that relate to cancelled orders and store the quantity cancelled for each cancelled order
# For every cancellation (negative quantity, not a Discount) we search for the
# original purchase(s): same customer, same stock code, earlier date, positive
# quantity. Outcomes:
#   - no counterpart  -> index goes to doubtful_entry (cancellation w/o purchase)
#   - one counterpart -> record the cancelled quantity on that purchase row
#   - several         -> credit the most recent (highest-index) purchase large
#                        enough to cover the cancellation
# Matched/doubtful cancellation rows are collected for removal afterwards.
data_cleaned = customer_data.copy(deep =True)
data_cleaned['QuantityCanceled'] = 0
entry_to_remove = []
doubtful_entry = []
for index, col in customer_data.iterrows():
    # Skip normal purchases and Discount lines (legitimately negative).
    if (col['Quantity']> 0) or col['Description'] == 'Discount':
        continue
    data_test = customer_data[(customer_data['CustomerID'] == col['CustomerID']) & (customer_data['StockCode'] == col['StockCode'])
                & (customer_data['InvoiceDate'] < col['InvoiceDate']) & (customer_data['Quantity'] > 0)].copy()
    #**********************************************
    #Cancellation without counterpart
    if (data_test.shape[0] == 0):
        doubtful_entry.append(index)
    #Cancelation with a counterpart
    elif (data_test.shape[0] == 1):
        counterpart_index = data_test.index[0]
        data_cleaned.loc[counterpart_index, 'QuantityCanceled'] = -col['Quantity']
        entry_to_remove.append(index)
    #Entries with several counterparts. We delete the last one
    elif (data_test.shape[0]>1):
        # Scan purchases newest-first; credit the first one big enough.
        data_test.sort_index(axis=0, ascending=False, inplace=True)
        for ind, val in data_test.iterrows():
            if val['Quantity'] < -col['Quantity']: continue
            data_cleaned.loc[ind, 'QuantityCanceled'] = -col['Quantity']
            entry_to_remove.append(index)
            break
print("Length of cancelled orders without a counterpart and ones with one or more counterpart:", len(doubtful_entry + entry_to_remove))
Length of cancelled orders without a counterpart and ones with one or more counterpart: 9771
#Remove entries that do not have a counterpart and ones that have atleast one counterpart
# Both lists hold row indices of the original frame; drop them from the copy.
data_cleaned.drop(entry_to_remove, axis=0, inplace=True)
data_cleaned.drop(doubtful_entry, axis=0, inplace=True)
print("Done!")
Done!
#Check for entries that have negative quantity
# 'D' (Discount) rows are legitimately negative, so exclude them here.
remaining_entries = data_cleaned[(data_cleaned['Quantity']<0) & (data_cleaned['StockCode']!='D')]
print("Remaining entries to delete: {}".format(remaining_entries.shape[0]))
remaining_entries[:5]
Remaining entries to delete: 739
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | QuantityCanceled | |
|---|---|---|---|---|---|---|---|---|---|
| 7188 | 536996 | 22712 | -20 | 2010-12-03 15:30:00 | 0.0 | 15287.69057 | United Kingdom | 0 | |
| 7201 | 537009 | 84534B | -80 | 2010-12-03 15:38:00 | 0.0 | 15287.69057 | United Kingdom | 0 | |
| 7202 | 537010 | 22162 | -40 | 2010-12-03 15:38:00 | 0.0 | 15287.69057 | United Kingdom | 0 | |
| 7205 | 537013 | 35965 | -25 | 2010-12-03 15:40:00 | 0.0 | 15287.69057 | United Kingdom | 0 | |
| 7291 | 537027 | 18098C | -140 | 2010-12-03 16:36:00 | 0.0 | 15287.69057 | United Kingdom | 0 |
#Remove remaining supsicious entries
# These 739 rows have negative quantity, zero price and no real customer.
data_cleaned.drop(remaining_entries.index,axis=0,inplace=True)
print("Done!")
Done!
# Confirm the cleanup: no suspicious negative-quantity rows should remain.
print("Number of entries to delete: {}".format(
    data_cleaned[(data_cleaned['Quantity'] < 0) & (data_cleaned['StockCode'] != 'D')].shape[0]))
# BUG FIX: the original passed the format string and the length as two separate
# arguments to print(), so the "{}" placeholder was printed literally (visible
# in the recorded output). Apply .format() instead. The dangling
# `remaining_entries[:5]` expression had no effect mid-script and was removed.
print("Length of data frame now: {}".format(len(data_cleaned)))
Number of entries to delete: 0
Length of data frame now: {} 526131
#Compute total amount for each entry
# Net out the cancelled quantity so totals reflect only what was actually kept.
data_cleaned['TotalPrice'] = data_cleaned['UnitPrice'] * (data_cleaned['Quantity'] - data_cleaned['QuantityCanceled'])
data_cleaned['TotalQuantity'] = data_cleaned['Quantity'] - data_cleaned['QuantityCanceled']
data_cleaned.sort_values('CustomerID')[:5]
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | QuantityCanceled | TotalPrice | TotalQuantity | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 61619 | 541431 | 23166 | MEDIUM CERAMIC TOP STORAGE JAR | 74215 | 2011-01-18 10:01:00 | 1.04 | 12346.0 | United Kingdom | 74215 | 0.00 | 0 |
| 535014 | 581180 | 23508 | MINI PLAYING CARDS DOLLY GIRL | 20 | 2011-12-07 15:52:00 | 0.42 | 12347.0 | Iceland | 0 | 8.40 | 20 |
| 535011 | 581180 | 21265 | PINK GOOSE FEATHER TREE 60CM | 12 | 2011-12-07 15:52:00 | 1.95 | 12347.0 | Iceland | 0 | 23.40 | 12 |
| 14968 | 537626 | 20782 | CAMOUFLAGE EAR MUFF HEADPHONES | 6 | 2010-12-07 14:57:00 | 5.49 | 12347.0 | Iceland | 0 | 32.94 | 6 |
| 286621 | 562032 | 23308 | SET OF 60 VINTAGE LEAF CAKE CASES | 24 | 2011-08-02 08:48:00 | 0.55 | 12347.0 | Iceland | 0 | 13.20 | 24 |
#Sum of purchases/ user & order
# FIX: select the aggregated columns with a *list*; indexing a groupby with an
# implicit tuple of keys is deprecated (the original run emitted a
# FutureWarning) and removed in newer pandas.
order_total_df = data_cleaned.groupby(
    by=['CustomerID', 'InvoiceNo', 'InvoiceDate'], as_index=False
)[['TotalPrice', 'TotalQuantity']].sum()
basket_price = order_total_df.rename(columns={'TotalPrice': 'Basket Price'})
#Selection of important entries
basket_price = basket_price[basket_price['Basket Price'] > 0]
basket_price.sort_values('CustomerID')[:6]
C:\Users\glori\AppData\Local\Temp\ipykernel_10432\2682052397.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
| CustomerID | InvoiceNo | InvoiceDate | Basket Price | TotalQuantity | |
|---|---|---|---|---|---|
| 1 | 12347.0 | 537626 | 2010-12-07 14:57:00 | 711.79 | 319 |
| 2 | 12347.0 | 542237 | 2011-01-26 14:30:00 | 475.39 | 315 |
| 3 | 12347.0 | 549222 | 2011-04-07 10:43:00 | 636.25 | 483 |
| 4 | 12347.0 | 556201 | 2011-06-09 13:01:00 | 382.52 | 196 |
| 5 | 12347.0 | 562032 | 2011-08-02 08:48:00 | 584.91 | 277 |
| 6 | 12347.0 | 573511 | 2011-10-31 12:25:00 | 1294.32 | 676 |
# Largest single basket (notebook display expression).
basket_price.loc[basket_price['Basket Price'].idxmax()]
# Bucket basket totals into price bands and count baskets per band.
price_range = [0, 50, 100, 200, 500, 1000, 5000, 50000]
count_price = [
    basket_price[(basket_price['Basket Price'] < upper) &
                 (basket_price['Basket Price'] > lower)]['Basket Price'].count()
    for lower, upper in zip(price_range, price_range[1:])
]
#*******************************************************************
#Representation of the purchases amount
plt.rc('font', weight='bold')
f, ax = plt.subplots(figsize=(10, 7))
labels = ['{}<.<{}'.format(lower, upper) for lower, upper in zip(price_range, price_range[1:])]
sizes = count_price
# The original explode expression always evaluated to 0.0 for every slice.
explode = [0.0] * len(sizes)
ax.pie(sizes, explode=explode, labels=labels,
       autopct=lambda x: '{:1.0f}%'.format(x) if x > 1 else '',
       shadow=False, startangle=0)
ax.axis('equal')
f.text(0.5, 1.01, "Representation of the purchases amount",
       ha='center', fontsize=18);
#Compute how recent a single customer performed a transaction
# Recency = whole days between a customer's last purchase and the newest
# purchase date in the whole data set.
df_recency = basket_price.groupby(by='CustomerID', as_index=False)['InvoiceDate'].max()
df_recency.columns = ['CustomerID', 'LastPurchaseDate']
recent_date = df_recency['LastPurchaseDate'].max()
df_recency['Recency'] = (recent_date - df_recency['LastPurchaseDate']).dt.days
df_recency
| CustomerID | LastPurchaseDate | Recency | |
|---|---|---|---|
| 0 | 12347.0 | 2011-12-07 15:52:00 | 1 |
| 1 | 12348.0 | 2011-09-25 13:13:00 | 74 |
| 2 | 12349.0 | 2011-11-21 09:51:00 | 18 |
| 3 | 12350.0 | 2011-02-02 16:01:00 | 309 |
| 4 | 12352.0 | 2011-11-03 14:37:00 | 35 |
| ... | ... | ... | ... |
| 4323 | 18280.0 | 2011-03-07 09:52:00 | 277 |
| 4324 | 18281.0 | 2011-06-12 10:53:00 | 180 |
| 4325 | 18282.0 | 2011-12-02 11:43:00 | 7 |
| 4326 | 18283.0 | 2011-12-06 12:02:00 | 3 |
| 4327 | 18287.0 | 2011-10-28 09:29:00 | 42 |
4328 rows × 3 columns
#Compute the number of times a customer has made a transaction
# One basket_price row per invoice, so counting rows per customer gives
# the number of (positive-total) orders.
frequency_df = basket_price.groupby('CustomerID', as_index=False)['InvoiceDate'].count()
frequency_df.columns = ['CustomerID', 'Frequency']
frequency_df.head()
| CustomerID | Frequency | |
|---|---|---|
| 0 | 12347.0 | 7 |
| 1 | 12348.0 | 4 |
| 2 | 12349.0 | 1 |
| 3 | 12350.0 | 1 |
| 4 | 12352.0 | 7 |
#Compute the total amount spent by a customer within the given time.
monetary_df = basket_price.groupby('CustomerID', as_index=False)['Basket Price'].sum()
monetary_df.columns = ['CustomerID', 'Monetary']
monetary_df.head()
| CustomerID | Monetary | |
|---|---|---|
| 0 | 12347.0 | 4310.00 |
| 1 | 12348.0 | 1797.24 |
| 2 | 12349.0 | 1757.55 |
| 3 | 12350.0 | 334.40 |
| 4 | 12352.0 | 2385.71 |
#Generate the RFM data set
# Join the three per-customer tables on CustomerID; the raw timestamp column
# is no longer needed once Recency is computed.
rf_df = df_recency.merge(frequency_df, on='CustomerID')
rfm_df = (rf_df.merge(monetary_df, on='CustomerID')
          .drop(columns='LastPurchaseDate'))
rfm_df.head()
| CustomerID | Recency | Frequency | Monetary | |
|---|---|---|---|---|
| 0 | 12347.0 | 1 | 7 | 4310.00 |
| 1 | 12348.0 | 74 | 4 | 1797.24 |
| 2 | 12349.0 | 18 | 1 | 1757.55 |
| 3 | 12350.0 | 309 | 1 | 334.40 |
| 4 | 12352.0 | 35 | 7 | 2385.71 |
# Boxplot of the raw (unscaled) RFM variables.
plt.figure(figsize=(12,12))
plt.title("RFM variables distribution")
rfm_df.boxplot()
<AxesSubplot:title={'center':'RFM variables distribution'}>
#Data normalization
# Standardize the three RFM features to zero mean / unit variance so that
# no single feature dominates the k-means distance.
print("Scaling data....")
feature_cols = ['Recency', 'Frequency', 'Monetary']
scaler = StandardScaler()
scaler.fit(rfm_df[feature_cols])
rfm_normalized = pd.DataFrame(scaler.transform(rfm_df[feature_cols]), columns=feature_cols)
print("Done!")
Scaling data.... Done!
# Display the scaled RFM table (notebook-style cell expression).
rfm_normalized
| Recency | Frequency | Monetary | |
|---|---|---|---|
| 0 | -0.905229 | 0.107094 | 0.072829 |
| 1 | -0.176468 | -0.025766 | -0.019727 |
| 2 | -0.735517 | -0.158625 | -0.021189 |
| 3 | 2.169545 | -0.158625 | -0.073610 |
| 4 | -0.565806 | 0.107094 | 0.001949 |
| ... | ... | ... | ... |
| 4323 | 1.850088 | -0.158625 | -0.079276 |
| 4324 | 0.881734 | -0.158625 | -0.082951 |
| 4325 | -0.845331 | -0.114339 | -0.079423 |
| 4326 | -0.885263 | 0.505672 | -0.010582 |
| 4327 | -0.495925 | -0.070052 | -0.018253 |
4328 rows × 3 columns
#Check for outliers after scaling
plt.figure(figsize=(12,12))
plt.title("Outlier variable distribution")
rfm_normalized.boxplot()
<AxesSubplot:title={'center':'Outlier variable distribution'}>
#Outlier information
# One extreme customer (row 2153): ~62 standard deviations above the mean
# in both Frequency and Monetary.
rfm_normalized[rfm_normalized['Monetary']>60]
| Recency | Frequency | Monetary | |
|---|---|---|---|
| 2153 | -0.915212 | 61.886723 | 62.558306 |
#Remove outlier information
# Drop the single extreme customer found above so it cannot dominate k-means.
print()
outlier_index = rfm_normalized[rfm_normalized['Monetary'] > 60].index
rfm_normalized.drop(outlier_index, inplace=True)
print("Done!")
Done!
# Recheck the distribution of data after removing the outlier
plt.figure(figsize=(12,12))
plt.title("RFM variable distribution")
rfm_normalized.boxplot()
<AxesSubplot:title={'center':'RFM variable distribution'}>
#Compute the optimal numer of clusters by the elbow method
# Fit k-means for k = 2..7 and record the inertia (within-cluster SSE);
# the "elbow" of the curve suggests a good k.
inertia = []
num_clusters = range(2, 8)
for n in num_clusters:
    model_km = KMeans(n_clusters=n, max_iter=50)
    model_km.fit(rfm_normalized)
    inertia.append(model_km.inertia_)
plt.figure(figsize=(16, 8))
plt.plot(num_clusters, inertia, 'bx-')
plt.xlabel('Number of clusters')
plt.ylabel('inertia')
plt.title('The Elbow Method showing the optimal k')
plt.show()
print()
#Silhouette analysis
# Average silhouette score per k: higher means better-separated clusters.
print("Calculating silhouette scores for different values of k.....")
for k in num_clusters:
    km_model = KMeans(n_clusters=k, max_iter=50)
    cluster_labels = km_model.fit_predict(rfm_normalized)
    silhouette_avg = silhouette_score(rfm_normalized, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(k, silhouette_avg))
Calculating silhouette scores for different values of k..... For n_clusters=2, the silhouette score is 0.6873406120911523 For n_clusters=3, the silhouette score is 0.5845143260289866 For n_clusters=4, the silhouette score is 0.6030597736942671 For n_clusters=5, the silhouette score is 0.492318872510545 For n_clusters=6, the silhouette score is 0.5060299893063445 For n_clusters=7, the silhouette score is 0.5185848606534478
#Visulaization of silhoutte analysis
# For each candidate k: left panel shows the per-sample silhouette values per
# cluster; right panel shows the data (Monetary vs Recency) with centroids.
print("Visulization of silhouette scores against clustered data for different values of k")
# FIX: the outer loop index was shadowed by the inner enumerate() variable
# (both were named `i`); the outer index was unused, so it is dropped.
for k in [2, 3, 4, 5, 6]:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    # Run the Kmeans algorithm
    km = KMeans(n_clusters=k)
    labels = km.fit_predict(rfm_normalized)
    centroids = km.cluster_centers_
    # Get silhouette samples
    silhouette_vals = silhouette_samples(rfm_normalized, labels)
    # Silhouette plot: one sorted horizontal band per cluster
    y_lower, y_upper = 0, 0
    for cluster_idx, cluster in enumerate(np.unique(labels)):
        cluster_silhouette_vals = silhouette_vals[labels == cluster]
        cluster_silhouette_vals.sort()
        y_upper += len(cluster_silhouette_vals)
        ax1.barh(range(y_lower, y_upper), cluster_silhouette_vals, edgecolor='none', height=1)
        ax1.text(-0.03, (y_lower + y_upper) / 2, str(cluster_idx + 1))
        y_lower += len(cluster_silhouette_vals)
    # Get the average silhouette score and plot it
    avg_score = np.mean(silhouette_vals)
    ax1.axvline(avg_score, linestyle='--', linewidth=2, color='green')
    ax1.set_yticks([])
    ax1.set_xlim([-0.1, 1])
    ax1.set_xlabel('Silhouette coefficient values')
    ax1.set_ylabel('Cluster labels')
    ax1.set_title('Silhouette plot for the various clusters', y=1.02);
    # Scatter plot of data colored with labels.
    # BUG FIX: the axes show Monetary (x) vs Recency (y), but the original
    # plotted centroid columns 0 and 1, which are Recency and Frequency given
    # the column order of rfm_normalized; use columns 2 and 0 instead.
    ax2.scatter(rfm_normalized['Monetary'], rfm_normalized['Recency'], c=labels)
    ax2.scatter(centroids[:, 2], centroids[:, 0], marker='*', c='r', s=250)
    ax2.set_xlabel('Monetary value')
    ax2.set_ylabel('Recency')
    ax2.set_title('Visualization of clustered data', y=1.02)
    ax2.set_aspect('equal')
    plt.tight_layout()
    plt.suptitle(f'Silhouette analysis using k = {k}',
                 fontsize=16, fontweight='semibold', y=1.05);
Visulization of silhouette scores against clustered data for different values of k
#Segmentation of customers
# Final model with k=4 (see the elbow/silhouette analysis above).
kmeans = KMeans(n_clusters=4, max_iter=50)
kmeans.fit(rfm_normalized)
KMeans(max_iter=50, n_clusters=4)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(max_iter=50, n_clusters=4)
# Re-attach CustomerID. Label-based alignment keeps rows matched even though
# one outlier row was dropped from rfm_normalized (hence 4327 vs 4328 rows).
rfm_normalized.loc[:, 'CustomerID'] = rfm_df['CustomerID']
rfm_normalized
| Recency | Frequency | Monetary | CustomerID | |
|---|---|---|---|---|
| 0 | -0.905229 | 0.107094 | 0.072829 | 12347.0 |
| 1 | -0.176468 | -0.025766 | -0.019727 | 12348.0 |
| 2 | -0.735517 | -0.158625 | -0.021189 | 12349.0 |
| 3 | 2.169545 | -0.158625 | -0.073610 | 12350.0 |
| 4 | -0.565806 | 0.107094 | 0.001949 | 12352.0 |
| ... | ... | ... | ... | ... |
| 4323 | 1.850088 | -0.158625 | -0.079276 | 18280.0 |
| 4324 | 0.881734 | -0.158625 | -0.082951 | 18281.0 |
| 4325 | -0.845331 | -0.114339 | -0.079423 | 18282.0 |
| 4326 | -0.885263 | 0.505672 | -0.010582 | 18283.0 |
| 4327 | -0.495925 | -0.070052 | -0.018253 | 18287.0 |
4327 rows × 4 columns
# Attach the fitted cluster label of each customer.
rfm_normalized['Cluster'] = kmeans.labels_
rfm_normalized
| Recency | Frequency | Monetary | CustomerID | Cluster | |
|---|---|---|---|---|---|
| 0 | -0.905229 | 0.107094 | 0.072829 | 12347.0 | 0 |
| 1 | -0.176468 | -0.025766 | -0.019727 | 12348.0 | 0 |
| 2 | -0.735517 | -0.158625 | -0.021189 | 12349.0 | 0 |
| 3 | 2.169545 | -0.158625 | -0.073610 | 12350.0 | 1 |
| 4 | -0.565806 | 0.107094 | 0.001949 | 12352.0 | 0 |
| ... | ... | ... | ... | ... | ... |
| 4323 | 1.850088 | -0.158625 | -0.079276 | 18280.0 | 1 |
| 4324 | 0.881734 | -0.158625 | -0.082951 | 18281.0 | 2 |
| 4325 | -0.845331 | -0.114339 | -0.079423 | 18282.0 | 0 |
| 4326 | -0.885263 | 0.505672 | -0.010582 | 18283.0 | 0 |
| 4327 | -0.495925 | -0.070052 | -0.018253 | 18287.0 | 0 |
4327 rows × 5 columns
# Cluster sizes (number of customers per label).
rfm_normalized['Cluster'].value_counts()
0 2875 2 803 1 626 3 23 Name: Cluster, dtype: int64
# Per-cluster distribution of the (scaled) Recency variable.
plt.figure(figsize=(12,12))
plt.title("Recency variable distribution within each cluster")
sns.boxplot(x='Cluster', y='Recency', data=rfm_normalized)
<AxesSubplot:title={'center':'Recency variable distribution within each cluster'}, xlabel='Cluster', ylabel='Recency'>
# Per-cluster distribution of the (scaled) Frequency variable.
plt.figure(figsize=(12,12))
plt.title("Frequency variable distribution within each cluster")
sns.boxplot(x='Cluster', y='Frequency', data=rfm_normalized)
<AxesSubplot:title={'center':'Frequency variable distribution within each cluster'}, xlabel='Cluster', ylabel='Frequency'>
# Per-cluster distribution of the (scaled) Monetary variable.
plt.figure(figsize=(12,12))
plt.title("Monetary variable distribution within each cluster")
sns.boxplot(x='Cluster', y='Monetary', data=rfm_normalized)
<AxesSubplot:title={'center':'Monetary variable distribution within each cluster'}, xlabel='Cluster', ylabel='Monetary'>
# Pairwise 2-D scatter views of the clusters across the three RFM axes.
plt.figure(figsize=(12,12))
plt.title("Clustering: Recency vs Monetary")
RM=sns.scatterplot(x='Recency', y='Monetary', hue='Cluster', palette="Set2", data=rfm_normalized)
plt.figure(figsize=(12,12))
plt.title("Clustering: Frequency vs Monetary")
FM=sns.scatterplot(x='Frequency', y='Monetary', hue='Cluster',palette="Set2", data=rfm_normalized)
plt.figure(figsize=(12,12))
plt.title("Clustering: Recency vs Frequency")
RF=sns.scatterplot(x='Recency', y='Frequency', hue='Cluster', palette="Set2", data=rfm_normalized)
# 3-D view of the clusters over all three RFM axes.
plt.rcParams["figure.figsize"] = (25,25)
fig = plt.figure(1)
plt.clf()
# FIX: instantiating Axes3D(fig, ...) directly is deprecated (the original run
# emitted a MatplotlibDeprecationWarning); create the 3-D axes through the
# figure and set the viewpoint with view_init instead.
ax = fig.add_axes([0, 0, .95, 1], projection='3d')
plt.cla()
ax.view_init(elev=48, azim=134)
ax.scatter(rfm_normalized['Recency'], rfm_normalized['Frequency'], rfm_normalized['Monetary'],
           c=rfm_normalized['Cluster'],
           s=200,
           cmap="spring",
           alpha=0.5,
           edgecolor='darkgrey')
ax.set_xlabel('Recency', fontsize=16)
ax.set_ylabel('Frequency', fontsize=16)
ax.set_zlabel('Monetary', fontsize=16)
plt.show()
# import plotly.express as px
# fig2 = px.scatter_3d(rfm_normalized, x="Frequency", y="Recency", z="Monetary", color="Cluster",)
# fig2.update_layout(title="3 Features Representation")
# fig2.show()
C:\Users\glori\AppData\Local\Temp\ipykernel_10432\3530151970.py:4: MatplotlibDeprecationWarning: Axes3D(fig) adding itself to the figure is deprecated since 3.4. Pass the keyword argument auto_add_to_figure=False and use fig.add_axes(ax) to suppress this warning. The default value of auto_add_to_figure will change to False in mpl3.5 and True values will no longer work in 3.6. This is consistent with other Axes classes.
#Selection and distribution of independent and dependent variables
# X = the three scaled RFM features (columns 0-2 of rfm_normalized).
X = rfm_normalized.iloc[:,:3].values
X[:10]
array([[-9.05228987e-01, 1.07093859e-01, 7.28288079e-02],
[-1.76467627e-01, -2.57655586e-02, -1.97274636e-02],
[-7.35517437e-01, -1.58624976e-01, -2.11894251e-02],
[ 2.16954497e+00, -1.58624976e-01, -7.36104514e-02],
[-5.65805888e-01, 1.07093859e-01, 1.94853772e-03],
[ 1.11134354e+00, -1.58624976e-01, -8.26496390e-02],
[ 1.39086845e+00, -1.58624976e-01, -4.61687449e-02],
[ 1.21117387e+00, -1.58624976e-01, -6.90061382e-02],
[-6.95585308e-01, -7.00520310e-02, 1.76297233e-02],
[-5.95754985e-01, -1.58624976e-01, 1.42728544e-01]])
# Y = the cluster label (column 4), kept 2-D for the one-hot encoder below.
Y = rfm_normalized.iloc[:,4:5].values
Y[:10]
array([[0],
[0],
[0],
[1],
[0],
[2],
[1],
[2],
[0],
[0]])
#One hot encode
# Convert the integer cluster labels to one-hot vectors for the softmax output.
encoder = OneHotEncoder()
Y = encoder.fit_transform(Y).toarray()
print('One hot encoded array:')
print(Y[0:5])
One hot encoded array: [[1. 0. 0. 0.] [1. 0. 0. 0.] [1. 0. 0. 0.] [0. 1. 0. 0.] [1. 0. 0. 0.]]
#Train test split of model
# 90/10 split; fixed random_state for reproducibility.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.1,random_state = 0)
print("Done!")
Done!
#Defining the model
model = Sequential()
model.add(Dense(X_train.shape[1], input_dim=X_train.shape[1], activation='relu')) #Input layer: one unit per feature (3 here, per the summary below)
model.add(Dense(4, activation='relu')) #Hidden layer with 4 units
model.add(Dense(4, activation='softmax'))#Output layer: 4 units, one per cluster
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 3) 12
dense_1 (Dense) (None, 4) 16
dense_2 (Dense) (None, 4) 20
=================================================================
Total params: 48
Trainable params: 48
Non-trainable params: 0
_________________________________________________________________
# Inspect the symbolic output tensor of the first layer.
model.layers[0].output
<KerasTensor: shape=(None, 3) dtype=float32 (created by layer 'dense')>
# Categorical cross-entropy matches the one-hot targets; train for 100 epochs,
# holding out a further 10% of the training split for validation.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, Y_train, epochs=100, batch_size=64, validation_split = 0.1)
Epoch 1/100 55/55 [==============================] - 1s 4ms/step - loss: 1.3834 - accuracy: 0.2446 - val_loss: 1.3150 - val_accuracy: 0.3872 Epoch 2/100 55/55 [==============================] - 0s 2ms/step - loss: 1.2735 - accuracy: 0.5950 - val_loss: 1.2236 - val_accuracy: 0.8077 Epoch 3/100 55/55 [==============================] - 0s 2ms/step - loss: 1.1934 - accuracy: 0.8051 - val_loss: 1.1491 - val_accuracy: 0.8154 Epoch 4/100 55/55 [==============================] - 0s 2ms/step - loss: 1.1231 - accuracy: 0.8074 - val_loss: 1.0802 - val_accuracy: 0.8179 Epoch 5/100 55/55 [==============================] - 0s 2ms/step - loss: 1.0570 - accuracy: 0.8085 - val_loss: 1.0150 - val_accuracy: 0.8205 Epoch 6/100 55/55 [==============================] - 0s 1ms/step - loss: 0.9941 - accuracy: 0.8088 - val_loss: 0.9537 - val_accuracy: 0.8205 Epoch 7/100 55/55 [==============================] - 0s 2ms/step - loss: 0.9353 - accuracy: 0.8111 - val_loss: 0.8971 - val_accuracy: 0.8308 Epoch 8/100 55/55 [==============================] - 0s 2ms/step - loss: 0.8806 - accuracy: 0.8225 - val_loss: 0.8450 - val_accuracy: 0.8462 Epoch 9/100 55/55 [==============================] - 0s 2ms/step - loss: 0.8307 - accuracy: 0.8459 - val_loss: 0.7978 - val_accuracy: 0.8590 Epoch 10/100 55/55 [==============================] - 0s 1ms/step - loss: 0.7851 - accuracy: 0.8527 - val_loss: 0.7550 - val_accuracy: 0.8692 Epoch 11/100 55/55 [==============================] - 0s 2ms/step - loss: 0.7435 - accuracy: 0.8664 - val_loss: 0.7157 - val_accuracy: 0.8744 Epoch 12/100 55/55 [==============================] - 0s 1ms/step - loss: 0.7053 - accuracy: 0.8710 - val_loss: 0.6795 - val_accuracy: 0.8769 Epoch 13/100 55/55 [==============================] - 0s 2ms/step - loss: 0.6700 - accuracy: 0.8821 - val_loss: 0.6459 - val_accuracy: 0.8769 Epoch 14/100 55/55 [==============================] - 0s 2ms/step - loss: 0.6371 - accuracy: 0.8881 - val_loss: 0.6147 - val_accuracy: 0.8897 Epoch 15/100 55/55 
[==============================] - 0s 2ms/step - loss: 0.6063 - accuracy: 0.8993 - val_loss: 0.5852 - val_accuracy: 0.8923 Epoch 16/100 55/55 [==============================] - 0s 2ms/step - loss: 0.5774 - accuracy: 0.9075 - val_loss: 0.5574 - val_accuracy: 0.9103 Epoch 17/100 55/55 [==============================] - 0s 2ms/step - loss: 0.5500 - accuracy: 0.9150 - val_loss: 0.5309 - val_accuracy: 0.9103 Epoch 18/100 55/55 [==============================] - 0s 2ms/step - loss: 0.5238 - accuracy: 0.9232 - val_loss: 0.5060 - val_accuracy: 0.9103 Epoch 19/100 55/55 [==============================] - 0s 2ms/step - loss: 0.4988 - accuracy: 0.9289 - val_loss: 0.4818 - val_accuracy: 0.9282 Epoch 20/100 55/55 [==============================] - 0s 2ms/step - loss: 0.4748 - accuracy: 0.9386 - val_loss: 0.4585 - val_accuracy: 0.9333 Epoch 21/100 55/55 [==============================] - 0s 2ms/step - loss: 0.4518 - accuracy: 0.9449 - val_loss: 0.4362 - val_accuracy: 0.9410 Epoch 22/100 55/55 [==============================] - 0s 1ms/step - loss: 0.4299 - accuracy: 0.9521 - val_loss: 0.4147 - val_accuracy: 0.9385 Epoch 23/100 55/55 [==============================] - 0s 2ms/step - loss: 0.4082 - accuracy: 0.9609 - val_loss: 0.3944 - val_accuracy: 0.9410 Epoch 24/100 55/55 [==============================] - 0s 2ms/step - loss: 0.3876 - accuracy: 0.9626 - val_loss: 0.3745 - val_accuracy: 0.9538 Epoch 25/100 55/55 [==============================] - 0s 2ms/step - loss: 0.3682 - accuracy: 0.9666 - val_loss: 0.3559 - val_accuracy: 0.9615 Epoch 26/100 55/55 [==============================] - 0s 1ms/step - loss: 0.3498 - accuracy: 0.9689 - val_loss: 0.3385 - val_accuracy: 0.9590 Epoch 27/100 55/55 [==============================] - 0s 2ms/step - loss: 0.3326 - accuracy: 0.9729 - val_loss: 0.3220 - val_accuracy: 0.9641 Epoch 28/100 55/55 [==============================] - 0s 2ms/step - loss: 0.3167 - accuracy: 0.9746 - val_loss: 0.3065 - val_accuracy: 0.9692 Epoch 29/100 55/55 
[==============================] - 0s 2ms/step - loss: 0.3015 - accuracy: 0.9737 - val_loss: 0.2925 - val_accuracy: 0.9641 Epoch 30/100 55/55 [==============================] - 0s 2ms/step - loss: 0.2874 - accuracy: 0.9789 - val_loss: 0.2788 - val_accuracy: 0.9744 Epoch 31/100 55/55 [==============================] - 0s 2ms/step - loss: 0.2742 - accuracy: 0.9812 - val_loss: 0.2663 - val_accuracy: 0.9744 Epoch 32/100 55/55 [==============================] - 0s 2ms/step - loss: 0.2622 - accuracy: 0.9809 - val_loss: 0.2545 - val_accuracy: 0.9821 Epoch 33/100 55/55 [==============================] - 0s 2ms/step - loss: 0.2508 - accuracy: 0.9814 - val_loss: 0.2434 - val_accuracy: 0.9821 Epoch 34/100 55/55 [==============================] - 0s 2ms/step - loss: 0.2404 - accuracy: 0.9812 - val_loss: 0.2331 - val_accuracy: 0.9846 Epoch 35/100 55/55 [==============================] - 0s 2ms/step - loss: 0.2304 - accuracy: 0.9840 - val_loss: 0.2238 - val_accuracy: 0.9872 Epoch 36/100 55/55 [==============================] - 0s 2ms/step - loss: 0.2211 - accuracy: 0.9834 - val_loss: 0.2147 - val_accuracy: 0.9872 Epoch 37/100 55/55 [==============================] - 0s 1ms/step - loss: 0.2128 - accuracy: 0.9857 - val_loss: 0.2069 - val_accuracy: 0.9872 Epoch 38/100 55/55 [==============================] - 0s 2ms/step - loss: 0.2048 - accuracy: 0.9832 - val_loss: 0.1987 - val_accuracy: 0.9897 Epoch 39/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1975 - accuracy: 0.9840 - val_loss: 0.1928 - val_accuracy: 0.9795 Epoch 40/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1905 - accuracy: 0.9857 - val_loss: 0.1853 - val_accuracy: 0.9872 Epoch 41/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1840 - accuracy: 0.9863 - val_loss: 0.1789 - val_accuracy: 0.9872 Epoch 42/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1779 - accuracy: 0.9860 - val_loss: 0.1723 - val_accuracy: 0.9974 Epoch 43/100 55/55 
[==============================] - 0s 2ms/step - loss: 0.1723 - accuracy: 0.9857 - val_loss: 0.1670 - val_accuracy: 0.9872 Epoch 44/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1668 - accuracy: 0.9846 - val_loss: 0.1617 - val_accuracy: 0.9923 Epoch 45/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1619 - accuracy: 0.9874 - val_loss: 0.1565 - val_accuracy: 0.9949 Epoch 46/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1573 - accuracy: 0.9852 - val_loss: 0.1519 - val_accuracy: 0.9974 Epoch 47/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1525 - accuracy: 0.9860 - val_loss: 0.1479 - val_accuracy: 0.9949 Epoch 48/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1485 - accuracy: 0.9854 - val_loss: 0.1437 - val_accuracy: 0.9949 Epoch 49/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1444 - accuracy: 0.9874 - val_loss: 0.1398 - val_accuracy: 0.9923 Epoch 50/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1407 - accuracy: 0.9860 - val_loss: 0.1350 - val_accuracy: 0.9974 Epoch 51/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1371 - accuracy: 0.9869 - val_loss: 0.1322 - val_accuracy: 0.9949 Epoch 52/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1334 - accuracy: 0.9869 - val_loss: 0.1282 - val_accuracy: 0.9949 Epoch 53/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1302 - accuracy: 0.9869 - val_loss: 0.1248 - val_accuracy: 0.9974 Epoch 54/100 55/55 [==============================] - 0s 2ms/step - loss: 0.1272 - accuracy: 0.9892 - val_loss: 0.1217 - val_accuracy: 0.9974 Epoch 55/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1242 - accuracy: 0.9869 - val_loss: 0.1195 - val_accuracy: 0.9949 Epoch 56/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1215 - accuracy: 0.9874 - val_loss: 0.1159 - val_accuracy: 0.9949 Epoch 57/100 55/55 
[==============================] - 0s 1ms/step - loss: 0.1189 - accuracy: 0.9883 - val_loss: 0.1133 - val_accuracy: 0.9974 Epoch 58/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1163 - accuracy: 0.9883 - val_loss: 0.1107 - val_accuracy: 0.9974 Epoch 59/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1138 - accuracy: 0.9883 - val_loss: 0.1085 - val_accuracy: 0.9974 Epoch 60/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1115 - accuracy: 0.9886 - val_loss: 0.1056 - val_accuracy: 0.9923 Epoch 61/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1097 - accuracy: 0.9877 - val_loss: 0.1034 - val_accuracy: 0.9974 Epoch 62/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1074 - accuracy: 0.9892 - val_loss: 0.1013 - val_accuracy: 0.9974 Epoch 63/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1053 - accuracy: 0.9889 - val_loss: 0.0999 - val_accuracy: 0.9974 Epoch 64/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1035 - accuracy: 0.9889 - val_loss: 0.0981 - val_accuracy: 0.9949 Epoch 65/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1015 - accuracy: 0.9880 - val_loss: 0.0954 - val_accuracy: 0.9949 Epoch 66/100 55/55 [==============================] - 0s 1ms/step - loss: 0.1002 - accuracy: 0.9872 - val_loss: 0.0937 - val_accuracy: 0.9974 Epoch 67/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0984 - accuracy: 0.9872 - val_loss: 0.0914 - val_accuracy: 0.9923 Epoch 68/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0968 - accuracy: 0.9872 - val_loss: 0.0903 - val_accuracy: 0.9974 Epoch 69/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0953 - accuracy: 0.9883 - val_loss: 0.0883 - val_accuracy: 0.9923 Epoch 70/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0937 - accuracy: 0.9883 - val_loss: 0.0871 - val_accuracy: 0.9974 Epoch 71/100 55/55 
[==============================] - 0s 1ms/step - loss: 0.0927 - accuracy: 0.9857 - val_loss: 0.0857 - val_accuracy: 0.9974 Epoch 72/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0909 - accuracy: 0.9874 - val_loss: 0.0843 - val_accuracy: 0.9974 Epoch 73/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0898 - accuracy: 0.9880 - val_loss: 0.0828 - val_accuracy: 0.9974 Epoch 74/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0886 - accuracy: 0.9883 - val_loss: 0.0826 - val_accuracy: 0.9974 Epoch 75/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0872 - accuracy: 0.9883 - val_loss: 0.0805 - val_accuracy: 0.9974 Epoch 76/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0860 - accuracy: 0.9880 - val_loss: 0.0792 - val_accuracy: 0.9974 Epoch 77/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0850 - accuracy: 0.9897 - val_loss: 0.0787 - val_accuracy: 0.9974 Epoch 78/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0839 - accuracy: 0.9877 - val_loss: 0.0769 - val_accuracy: 0.9974 Epoch 79/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0830 - accuracy: 0.9894 - val_loss: 0.0762 - val_accuracy: 0.9974 Epoch 80/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0822 - accuracy: 0.9877 - val_loss: 0.0747 - val_accuracy: 0.9974 Epoch 81/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0808 - accuracy: 0.9892 - val_loss: 0.0733 - val_accuracy: 0.9974 Epoch 82/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0799 - accuracy: 0.9897 - val_loss: 0.0722 - val_accuracy: 0.9974 Epoch 83/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0790 - accuracy: 0.9880 - val_loss: 0.0722 - val_accuracy: 0.9974 Epoch 84/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0782 - accuracy: 0.9889 - val_loss: 0.0705 - val_accuracy: 0.9974 Epoch 85/100 55/55 
[==============================] - 0s 1ms/step - loss: 0.0775 - accuracy: 0.9894 - val_loss: 0.0704 - val_accuracy: 0.9974 Epoch 86/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0765 - accuracy: 0.9889 - val_loss: 0.0683 - val_accuracy: 0.9974 Epoch 87/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0754 - accuracy: 0.9892 - val_loss: 0.0672 - val_accuracy: 0.9974 Epoch 88/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0747 - accuracy: 0.9892 - val_loss: 0.0661 - val_accuracy: 0.9974 Epoch 89/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0739 - accuracy: 0.9900 - val_loss: 0.0653 - val_accuracy: 0.9974 Epoch 90/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0729 - accuracy: 0.9909 - val_loss: 0.0648 - val_accuracy: 0.9974 Epoch 91/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0721 - accuracy: 0.9903 - val_loss: 0.0638 - val_accuracy: 0.9974 Epoch 92/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0714 - accuracy: 0.9903 - val_loss: 0.0635 - val_accuracy: 0.9974 Epoch 93/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0707 - accuracy: 0.9897 - val_loss: 0.0612 - val_accuracy: 0.9923 Epoch 94/100 55/55 [==============================] - 0s 2ms/step - loss: 0.0698 - accuracy: 0.9900 - val_loss: 0.0613 - val_accuracy: 0.9974 Epoch 95/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0687 - accuracy: 0.9914 - val_loss: 0.0599 - val_accuracy: 0.9974 Epoch 96/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0682 - accuracy: 0.9900 - val_loss: 0.0588 - val_accuracy: 0.9923 Epoch 97/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0671 - accuracy: 0.9903 - val_loss: 0.0582 - val_accuracy: 0.9974 Epoch 98/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0661 - accuracy: 0.9914 - val_loss: 0.0566 - val_accuracy: 0.9949 Epoch 99/100 55/55 
[==============================] - 0s 1ms/step - loss: 0.0651 - accuracy: 0.9914 - val_loss: 0.0557 - val_accuracy: 0.9974 Epoch 100/100 55/55 [==============================] - 0s 1ms/step - loss: 0.0639 - accuracy: 0.9900 - val_loss: 0.0548 - val_accuracy: 0.9974
print("Predicting.....")
# Row-wise class probabilities for the held-out test set,
# shape (n_samples, n_classes).
y_pred = model.predict(X_test)
# Converting predictions to label: one vectorized argmax over axis 1
# replaces the former Python-level loop appending row by row.
# .tolist() keeps `pred` a plain list of ints, as downstream cells expect.
pred = np.argmax(y_pred, axis=1).tolist()
print("Done!")
Predicting..... 14/14 [==============================] - 0s 694us/step Done!
#Converting one hot encoded test label to label
# Vectorized argmax over axis 1 replaces the former per-row Python loop;
# .tolist() keeps `test` a plain list of ints, matching `pred` above.
test = np.argmax(Y_test, axis=1).tolist()
print("Done!")
Done!
print()
# Fixed typo in the user-facing label: "Pridicted" -> "Predicted".
print("Predicted values")
# Bare expression: relies on notebook auto-display to show the first
# ten predicted labels.
pred[:10]
Predicted values
[2, 0, 1, 2, 0, 0, 2, 0, 0, 0]
print()
# First ten ground-truth labels, for a quick eyeball comparison with the
# predictions shown above (bare expression -> notebook auto-display).
print("Actual values")
test[:10]
Actual values
[2, 0, 1, 2, 0, 0, 2, 0, 0, 0]
# sklearn's documented signature is accuracy_score(y_true, y_pred).
# Accuracy itself is symmetric, so the number is unchanged, but the
# argument order now matches the convention (truth first).
a = accuracy_score(test, pred)
print('Accuracy of model on test data is:', a*100)
Accuracy of model on test data is: 99.76905311778292
# Learning curves: training vs. validation accuracy per epoch.
for metric in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper left')
plt.show()
# Learning curves: training vs. validation loss per epoch.
for metric in ('loss', 'val_loss'):
    plt.plot(history.history[metric])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper left')
plt.show()
# print("Saving model....")
# model.save('./Final_outputs/model.h5');
# print("Model saved")
#Function to predict customer group from user input
def run_model(Recency_pred, Frequency_pred, Monetary_pred):
    """Predict the customer segment for one Recency/Frequency/Monetary triple.

    Builds a single-row DataFrame from the three widget values, scales it
    with the previously fitted ``scaler``, runs the trained Keras ``model``,
    and returns an ipywidgets ``Output`` widget containing the index of the
    predicted cluster. Intermediate frames are printed for transparency.
    """
    frame = pd.DataFrame({
        'Recency': [Recency_pred],
        'Frequency': [Frequency_pred],
        'Monetary': [Monetary_pred],
    })
    print()
    print("UNSCALED DATA SET")
    print(frame)
    # Apply the same StandardScaler fitted on the training data so the
    # network sees inputs on the scale it was trained with.
    frame = pd.DataFrame(scaler.transform(frame), columns=frame.columns)
    print()
    print("SCALED DATA SET")
    print(frame)
    print()
    print("PREDICTING......")
    features = np.array(frame)
    print(features)
    probabilities = model.predict(features)
    # Most probable cluster index.
    cluster = np.argmax(probabilities)
    out = Output(layout={'border': '1px solid white'})
    with out:
        print(cluster)
    return out
# Wire run_model to three bounded numeric input widgets; the max bounds
# mirror the observed range of each RFM feature in the training data.
# NOTE(review): the Frequency widget sets value=0 below its min=1 —
# ipywidgets clamps the initial value up to the minimum; confirm intent.
interact(run_model, Recency_pred = BoundedFloatText(value = 0, min = 0, max=373, step = 1, description = 'Recency'),
Frequency_pred = BoundedFloatText(value = 0, min = 1, step = 1, max=1402, description = 'Frequency'),
Monetary_pred = BoundedFloatText(value = 2.9, min = 2.9, step = 1,max=279765.02, description = 'Monetary'),)
interactive(children=(BoundedFloatText(value=0.0, description='Recency', max=373.0, step=1.0), BoundedFloatTex…
<function __main__.run_model(Recency_pred, Frequency_pred, Monetary_pred)>